1. Identifying Successful Projects

a) Success by Category

library(dplyr)
library(ggplot2)

sub_ks <- ks %>% 
  filter(backers_count > mean(backers_count)) %>%
  filter(state == "successful") %>% 
  group_by(top_category) %>% 
  count()

plot1a <- ggplot(sub_ks, aes(x = reorder(top_category, -n), y = n)) +
  geom_bar(stat = "identity", fill = "lightskyblue1") +
  coord_flip() +
  xlab(NULL) +
  ylab(NULL) +
  ggtitle("Success by Category") +
  geom_text(aes(label = n), color = "firebrick4")

plot1a

I subset the dataset to include only observations with number of backers that is greater than the average and are classified as “successful” in the state category. I used these two criteria to measure success among these kickstarter categories. The top 3 categories that are most likely to succeed are music, technology, and publishing while the least successful ones are dance, journalism, and theater. The top 5 successful categories all have over 2200 successful observations while the 5 least successful categories have less than 350 observations.

b) Success by Location - By State

library(leaflet)
library(rgdal)
library(RColorBrewer)

us_state <- read.delim("~/Desktop/QMSS - CU/Spring 2020/GR5063_DataViz/course_materials/exercises/09_kickstarter/us_state.txt", 
                       header=FALSE) %>% 
  rename(NAME = V2,
         pop = V3,
         location_state = V4)
ks_state <- ks %>% 
  filter(backers_count > mean(backers_count)) %>%
  filter(state == "successful") %>% 
  group_by(location_state) %>% 
  count()
ks_state <- left_join(ks_state, us_state)
us <- readOGR("us_state_500k.json", verbose = FALSE)
ks_state <- left_join(ks_state, us@data)

# Success by State
pal <- colorNumeric("YlOrRd", domain = ks_state$n)

map1b <- leaflet() %>% 
  addTiles() %>% 
  setView(lng = -110.389215, lat = 46.820186, zoom = 2.5) %>% 
  addPolygons(data = us,
              fillColor = ~pal(ks_state$n),
              fillOpacity = 1) %>% 
  addLegend(pal = pal,
            values = ks_state$n)
map1b
# Normalized by population estimate on July 1st 2016
ks_state$npop <- as.numeric(ks_state$pop) / ks_state$n
pal <- colorNumeric("Spectral", domain = ks_state$npop)

map1bpop <- leaflet() %>% 
  addTiles() %>% 
  setView(lng = -110.389215, lat = 46.820186, zoom = 2.5) %>% 
  addPolygons(data = us,
              fillColor = ~pal(ks_state$npop),
              fillOpacity = 1) %>% 
  addLegend(pal = pal,
            values = ks_state$npop)
map1bpop

The smaller number is associated with darker color, which means less people are involved for each successful project so that this state is more innovative.

Innovative US Cities

ks_city <- ks %>% 
  filter(backers_count > mean(backers_count)) %>%
  filter(state == "successful") %>% 
  group_by(location_town) %>% 
  count() %>% 
  arrange(desc(n)) %>% 
  filter(n > 52)

2. Writing your success story

a) Cleaning the Text and Word Cloud

library(tm)
library(SnowballC)

successful_df <- ks %>% 
  filter(backers_count > mean(backers_count)) %>%
  filter(state == "successful") %>% 
  arrange(desc(converted_pledged_amount)) %>% 
  slice(1:1000)

# Remove fully capitalized words
successful <- trimws(gsub("\\b[A-Z]+\\b", "", successful_df$blurb))

successful_corpus <- Corpus(VectorSource(successful))

unsuccessful_df <- ks %>% 
  filter(state == "failed") %>% 
  arrange(converted_pledged_amount) %>% 
  slice(1:1000)

unsuccessful <- trimws(gsub("\\b[A-Z]+\\b", "", unsuccessful_df$blurb))

unsuccessful_corpus <- Corpus(VectorSource(unsuccessful))
removeNumPunct <- function(x){gsub("[^[:alpha:][:space:]]*", "", x)}

clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, c(stopwords("en")))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, content_transformer(removeNumPunct))
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}
sc_clean <- clean_corpus(successful_corpus)
uc_clean <- clean_corpus(unsuccessful_corpus)

sc_stem <- tm_map(sc_clean, stemDocument)
uc_stem <- tm_map(uc_clean, stemDocument)


stemCompletion2 <- function(x, dictionary) {
   x <- unlist(strsplit(as.character(x), " "))
   x <- x[x != ""]
   x <- stemCompletion(x, dictionary=dictionary)
   x <- paste(x, sep="", collapse=" ")
   PlainTextDocument(stripWhitespace(x))
}

sc_comp <- lapply(sc_stem, stemCompletion2, dictionary = sc_clean)
uc_comp <- lapply(uc_stem, stemCompletion2, dictionary = uc_clean)

sc_comp <- as.VCorpus(sc_comp)
uc_comp <- as.VCorpus(uc_comp)

# Create DTM
sc_dtm <- DocumentTermMatrix(sc_comp)
uc_dtm <- DocumentTermMatrix(uc_comp)
# Attach clean, stemmed, and completed blurb back to the dataframe - for later use
for (p in 1:1000){
  successful_df$clean_blurb[p] <- sc_comp[[p]][[1]]
}

for (p in 1:1000){
  unsuccessful_df$clean_blurb[p] <- uc_comp[[p]][[1]]
}
library(wordcloud)

# successful
m <- as.matrix(sc_dtm)
v <- sort(colSums(m),decreasing = TRUE)
d <- data.frame(word = names(v),freq = v)

set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 25,
          max.words = 200, random.order = FALSE, rot.per = 0.35, 
          colors = brewer.pal(8, "Dark2"))

# not successful
m2 <- as.matrix(uc_dtm)
v2 <- sort(colSums(m2),decreasing = TRUE)
d2 <- data.frame(word = names(v2),freq = v2)

set.seed(1234)
wordcloud(words = d2$word, freq = d2$freq, min.freq = 25,
          max.words = 200, random.order = FALSE, rot.per = 0.35, 
          colors = brewer.pal(8, "Dark2"))

b) Success in Words

library(plotrix)

sc_word <- head(d, 50)
uc_word <- d2 %>% 
  filter(word %in% sc_word$word) %>% 
  rename(uc_cnt = freq)

common_word <- left_join(uc_word, sc_word)
common_word <- common_word[1:20, ]

plot2b <- pyramid.plot(common_word$freq, common_word$uc_cnt,
                       labels = common_word$word, gap = 10, 
                       top.labels = c("Successful", " ", "Unsuccessful"),
                       main = "Words in Common", laxlab = NULL, 
                       raxlab = NULL, unit = NULL, labelcex = 0.5)

c) Simplicity as A Virtue

library(quanteda)

ks_sim <- ks %>%
  select(backers_count, blurb, state) %>%
  filter(backers_count < 1000) %>%
  filter(state %in% c("successful", "failed")) %>%
  slice(1:1000)

ks_sim$n <- textstat_readability(as.character(ks_sim$blurb),
                     measure = 'Flesch.Kincaid')

plot2c <- ggplot(ks_sim, aes(x = ks_sim$n$Flesch.Kincaid,
                             y = backers_count,
                             color = state)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlim(-4, 20) +
  xlab("Flesch-Kincaid Readability Level") +
  ylab("Number of Backers") +
  facet_grid(~state)

plot2c

successful_df$clean_blurb <- ifelse(successful_df$clean_blurb == "",
                                    as.character(successful_df$blurb),
                                    successful_df$clean_blurb)

successful_df$n <- textstat_readability(successful_df$clean_blurb, 
                     measure = 'Flesch.Kincaid')

plot2c1 <- ggplot(successful_df, aes(x = successful_df$n$Flesch.Kincaid, 
                             y = backers_count)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlim(-4, 20) +
  ylim(0, 10000) +
  xlab("Flesch-Kincaid Readability Level") +
  ylab("Number of Backers")

plot2c1

unsuccessful_df$clean_blurb <- ifelse(unsuccessful_df$clean_blurb == "",
                                    as.character(unsuccessful_df$blurb),
                                    unsuccessful_df$clean_blurb)

unsuccessful_df$n <- textstat_readability(as.character(unsuccessful_df$blurb), 
                     measure = 'Flesch.Kincaid')

plot2c2 <- ggplot(unsuccessful_df, aes(x = unsuccessful_df$n$Flesch.Kincaid, 
                             y = backers_count)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Flesch-Kincaid Readability Level") +
  ylab("Number of Backers")

plot2c2

For plot2c, I only use a subset of 1000 observations of the dataset that are either successful or failed and have less than 1000 backers. I use the Flesc-Kincaid readability test to measure simplicity of the blurbs. The plot is also color coded with the blue dots representing successful projects. The projects that failed are red and they all have very a small number of backers than the successful ones. In general, for both successful and failed projects, if the readability of their blurbs is of higher level, they are more likely to have a larger number of backers.

As for the dataset with 1000 most successful projects, readability is not positively associated with number of backers and this association is even negative, which shows that simplicity of the blurbs is not very important in determining the success of the projects. As for the least successful projects, they rearely have any backers so the association of readability is not prominent.

3. Sentiment

a) Stay Positive - Bing

library(tidytext)

s_sent1 <- successful_df %>%
  select(clean_blurb) %>%
  mutate(RowNum = row_number()) %>%
  unnest_tokens(word, clean_blurb) %>%
  inner_join(get_sentiments("bing"))

for (p in 1:1000){
  pos_count <- s_sent1 %>%
    filter(RowNum == p) %>%
    filter(sentiment == "positive") %>%
    count()
  
  neg_count <- s_sent1 %>%
    filter(RowNum == p) %>%
    filter(sentiment == "negative") %>%
    count()
  
  successful_df$sent_bing[p] <- (pos_count - neg_count) / (pos_count + neg_count)
}

plot3a1 <- ggplot(successful_df, aes(x = as.factor(round(as.numeric(sent_bing), 2)), y = pledged, fill = as.factor(round(as.numeric(sent_bing), 2)))) +
  geom_boxplot() +
  xlab("Sentiment Score") +
  ylab("Pledged Amount ($)") +
  ylim(0, 5000000) +
  theme(legend.position = "none")

plot3a1

s_sent2 <- unsuccessful_df %>%
  select(clean_blurb) %>%
  mutate(RowNum = row_number()) %>%
  unnest_tokens(word, clean_blurb) %>%
  inner_join(get_sentiments("bing"))

for (p in 1:1000){
  pos_count <- s_sent2 %>%
    filter(RowNum == p) %>%
    filter(sentiment == "positive") %>%
    count()
  
  neg_count <- s_sent2 %>%
    filter(RowNum == p) %>%
    filter(sentiment == "negative") %>%
    count()
  
  unsuccessful_df$sent_bing[p] <- (pos_count - neg_count) / (pos_count + neg_count)
}

unsuccessful_df$sent_bing <- as.factor(round(as.numeric(unsuccessful_df$sent_bing), 2))

f_sent <- unsuccessful_df %>%
  select(sent_bing) %>%
  group_by(sent_bing) %>% 
  count()
  
  
plot3a2 <- ggplot(f_sent, aes(x = sent_bing, y = n, fill = sent_bing)) +
  geom_bar(stat = "identity") +
  xlab("Sentiment Score") +
  ylab("Number of Failed Project") +
  # ylim(0, 5000000) +
  theme(legend.position = "none") +
  geom_text(aes(label = n), color = "black")

plot3a2

I use the Bing sentiments from tidytext to count positive and negative words. The sentiment score is calculated by (pos_count - neg_count) / (pos_count + neg_count). The higher score indicates a more positive tone of the blurb. For the 1000 most successful projects, I use boxplots to show the distribution of pledged amount for each sentiment score. Overall, the positive blurb is associated with higher median of pledged amount than the negative blurb. The successful projects that have most outliers have a sentiment score of 1, and, interestingly, these outliers all have very large pledged amount. Pledged amounts are all zero for the 1000 failed projects. Therefore, as for the second plot, I display the count for failed blurbs of different sentiment score, most of which are scored 1. For projects that failed, tone does not matter as much.

b) Positive vs Negative

successful_df$sent_bing <- as.numeric(successful_df$sent_bing)

unsuccessful_df$sent_bing <- as.numeric(unsuccessful_df$sent_bing)

# Positive DTM
s_pos <- successful_df %>% 
  na.omit(sent_bing) %>% 
  filter(sent_bing > 0) %>% 
  select(clean_blurb) 

f_pos <- unsuccessful_df %>% 
  na.omit(sent_bing) %>% 
  filter(sent_bing > 0) %>% 
  select(clean_blurb) 

p_blurb <- rbind(s_pos, f_pos)

p_sent <- p_blurb %>%
  select(clean_blurb) %>%
  unnest_tokens(word, clean_blurb) %>%
  inner_join(get_sentiments("bing")) %>% 
  filter(sentiment == "positive") %>% 
  select(word)


# Negative DTM
s_neg <- successful_df %>% 
  na.omit(sent_bing) %>% 
  filter(sent_bing <= 0) %>% 
  select(clean_blurb) 

f_neg <- unsuccessful_df %>% 
  na.omit(sent_bing) %>% 
  filter(sent_bing <= 0) %>% 
  select(clean_blurb) 

n_blurb <- rbind(s_neg, f_neg)

n_sent <- n_blurb %>%
  select(clean_blurb) %>%
  unnest_tokens(word, clean_blurb) %>%
  inner_join(get_sentiments("bing")) %>% 
  filter(sentiment == "negative") %>% 
  select(word)

pn_corpus <- Corpus(VectorSource(c(p_sent, n_sent)))
pn_tdm <- TermDocumentMatrix(pn_corpus)
pn_tdm_m <- as.matrix(pn_tdm)

set.seed(1234)
comparison.cloud(pn_tdm_m, colors = c("red", "blue"), 
                 scale=c(0.1, 2), title.size= 1, 
                 max.words = 100)

The top half of the comparison cloud, the red part, represents words of the positive text while the bottom half, the blue part, represent words of the negative text.

c) Get in Their Mind

var <- c("backers_count", "clean_blurb", "state", "pledged")
sf_sub <- rbind(successful_df[ , var], unsuccessful_df[ ,var])

sf_sent <- sf_sub %>% 
  select(clean_blurb, state) %>% 
  unnest_tokens(word, clean_blurb) %>% 
  inner_join(get_sentiments("nrc")) %>% 
  group_by(state) %>% 
  count(sentiment)

neg_sum <- sum(sf_sent$n[1:10])
pos_sum <- sum(sf_sent$n[11:20])

sf_perc <- sf_sent %>% 
  mutate(prop = case_when(state == "failed" ~ n / neg_sum,
                   TRUE ~ n / pos_sum))

plot3c <- ggplot(sf_perc, aes(x = sentiment, y = prop, fill = state)) +
  geom_bar(stat = "identity", position = "dodge") + 
  coord_flip() +
  ylab(NULL) +
  xlab(NULL)

plot3c

I selected all the words that meet the 10 NRC emotion categories and create a column called “prop” to represent the proportion of words in each emotion categories among all words whose state are either successful or failed. For successful projects, positive emotions like trust and positive have higher proportion than failed projects. For failed projects, emotions like sadness and disgust have higher proportion than successful projects. However, positive emotions like joy and anticipation are more common for failed projects while negative and anger are more common for successful projects. Therefore, it would be biased to conclude that positive emotions are dominating the successful projects.